Examine PCR enrichment data

Organize single cell libraries

First designate the libraries and the cells that were resampled.

libs <- list("original",
             "standard", 
             "biotinylated", 
             "phosphorothioate")

cells <- list(
  standard = c(
    "Cell_4Bone_63_70",
    "Cell_4Bone_27_61",
    "Cell_4Bone_34_48",
    "Cell_1Tumor_0_55",
    "Cell_1Tumor_7_50",
    "Cell_4Bone_30_23",
    "Cell_4Bone_61_22",
    "Cell_4Bone_66_9",
    "Cell_3Brain_47_38",
    "Cell_3Brain_14_46"
  ),
  biotinylated = c(
    "Cell_4Bone_63_70",
    "Cell_4Bone_27_61",
    "Cell_4Bone_34_48",
    "Cell_1Tumor_0_55",
    "Cell_1Tumor_7_50",
    "Cell_3Brain_47_38",
    "Cell_3Brain_14_46",
    "Cell_4Bone_28_54",
    "Cell_4Bone_26_20",
    "Cell_4Bone_60_43"
  ),
  phosphorothioate = c(
    "Cell_1Tumor_7_50",
    "Cell_3Brain_47_38",
    "Cell_4Bone_28_54",
    "Cell_4Bone_26_20",
    "Cell_4Bone_60_43" 
  )
)

bc_metadat <- read_tsv(file.path(data_dir, 
                       "original",
                       "fastq",
                       "original",
                       "well_data_barcode_keys.txt"),
                       col_names = c("cell_id", "barcode_10x"))

## original library to compare against
reflib <- "original"
resampled_libs <- c(
             "standard", 
             "biotinylated", 
             "phosphorothioate")

## reference resampled lib for resampled vs control plots
resampled_lib <- "phosphorothioate"


## pretty name for libraries
lib_names = c(
  original = "Original Library",
  standard = "PCR",
  biotinylated = "PCR +\n5' Biotin",
  phosphorothioate = "PCR +\n5' Biotin +\n3' Phosphorothioates"
  )

## pretty names for cells

cell_names <- list(
  standard = c(
    "Cell_4Bone_63_70" = "Bone Met Cell #4", 
    "Cell_4Bone_27_61" = "Bone Met Cell #5",
    "Cell_4Bone_34_48" = "Bone Met Cell #6",
    "Cell_1Tumor_0_55" = "Primary Tumor Cell #2",
    "Cell_1Tumor_7_50" = "Primary Tumor Cell #1",
    "Cell_4Bone_30_23" = "Bone Met Cell #7",
    "Cell_4Bone_61_22" = "Bone Met Cell #8",
    "Cell_4Bone_66_9" = "Bone Met Cell #9",
    "Cell_3Brain_47_38" = "Brain Met Cell #1",
    "Cell_3Brain_14_46" = "Brain Met Cell #2"
  ),
  biotinylated = c(
    "Cell_4Bone_63_70" = "Bone Met Cell #4", 
    "Cell_4Bone_27_61" = "Bone Met Cell #5",
    "Cell_4Bone_34_48" = "Bone Met Cell #6",
    "Cell_1Tumor_0_55" = "Primary Tumor Cell #2",
    "Cell_1Tumor_7_50" = "Primary Tumor Cell #1",
    "Cell_3Brain_47_38" = "Brain Met Cell #1",
    "Cell_3Brain_14_46" = "Brain Met Cell #2",
    "Cell_4Bone_28_54" = "Bone Met Cell #1",
    "Cell_4Bone_26_20" = "Bone Met Cell #2",
    "Cell_4Bone_60_43" = "Bone Met Cell #3" 
  ),
  phosphorothioate = c(
    "Cell_1Tumor_7_50" = "Primary Tumor Cell #1",
    "Cell_3Brain_47_38" = "Brain Met Cell #1",
    "Cell_4Bone_28_54" = "Bone Met Cell #1",
    "Cell_4Bone_26_20" = "Bone Met Cell #2",
    "Cell_4Bone_60_43" = "Bone Met Cell #3" 
  )
)

Load and organize a table for each library of read counts per cell per gene, and a table of umi counts per cell per gene.

umis_to_genes <- function(umipath, cells_to_exclude = c("Cell_unmatched")){
  umis <- read_tsv(umipath,
                   col_names = c("barcode_10x", 
                                 "umi_molecule", 
                                 "count")) %>% 
    filter(barcode_10x != cells_to_exclude)
  
  mol_fields <- str_count(umis$umi_molecule[1], "::")
  
  if(mol_fields == 2 ){
    umis <- separate(umis, umi_molecule, 
                     into = c("seq", "genome", "gene"),
                     sep = "::") %>% 
      mutate(gene = str_c(genome, "::", gene))
  } else if (mol_fields == 1){
    umis <- separate(umis, umi_molecule, 
                     into = c("seq", "gene"),
                     sep = "::")
  } else {
    stop("separator :: missing from umi_molecule field")
  }
  
  reads <- select(umis, 
                  barcode_10x, 
                  gene,
                  count)
  
  reads <- group_by(reads, 
                   barcode_10x, gene) %>% 
    summarize(counts = sum(count))
  
  reads <- spread(reads, barcode_10x, counts, 
                  fill = 0L)
  
  reads
}

## read in umigroups flat file with read counts per umi per gene per cell
## expand out to a read count matrix
umipaths <- file.path(data_dir, 
                      libs, 
                      "umis",
                      "umigroups.txt.gz")
read_dat <- map(umipaths, 
                ~umis_to_genes(.))
names(read_dat) <- libs

## read in umi_tools count table with umi counts per gene per cell
umi_dat <- map(libs, 
                ~read_tsv(file.path(data_dir, 
                          .x,
                          "dgematrix",
                          "dge_matrix.txt")) %>% 
                 select(-matches("Cell_unmatched")))
names(umi_dat) <- libs


all_cells <- c(list(original = unlist(cells) %>% unname()), 
               cells)

cell_obj_mdata <- map(all_cells, 
                      ~mutate(bc_metadat, 
                              resampled = ifelse(cell_id %in% .x,
                                                  TRUE,
                                                  FALSE)))

Next organize these tables into simple classes called resampled-sets to keep track of each experiment’s relavant raw, processed, and meta data.

#' simple class to hold info for each experiment
create_sc_obj <- function(umi_df,
                          read_df,
                          cell_mdata_df){
  x <- list()
  class(x) <- "resampled-set"
  x$umis <- umi_df
  x$reads <- read_df
  x$meta_dat <- cell_mdata_df
  return(x)
}

sc_objs <- list(umi_dat, read_dat, cell_obj_mdata)
sc_objs <- pmap(sc_objs, create_sc_obj)

rm(umi_dat)
rm(read_dat)

Next perform basic processing. 1) generate separate objects to store sparse matrices of umi and read counts. 2) normalize read and umi count data by total library size (sum of all read or umi counts for all cells in the experiment) and report as Reads per million or UMIs per million. 3) Compute per cell metrics (read and umi counts, sequencing saturation)

tidy_to_matrix <- function(df){
   df <- as.data.frame(df)
   rownames(df) <- df[, 1]
   df[, 1] <- NULL
   mat <- as.matrix(df)
   mat <- as(mat, "sparseMatrix")   
   return(mat)
}

#' keep both tidy and matrix objs
generate_matrices <- function(sc_obj){
  sc_obj$umi_matrix <- tidy_to_matrix(sc_obj$umis)
  sc_obj$read_matrix <- tidy_to_matrix(sc_obj$reads)
  sc_obj
}

#' normalize by library size (Reads per Million)
norm_libsize <- function(sc_obj){
  sc_obj$norm_umi <- 1e6 * sweep(sc_obj$umi_matrix, 2, 
                                 sum(as.vector(sc_obj$umi_matrix)), "/")
  sc_obj$norm_reads <- 1e6 * sweep(sc_obj$read_matrix, 2, 
                                   sum(as.vector(sc_obj$read_matrix)), "/")
  sc_obj
}

add_metadata <- function(sc_obj, dat){
  if (is.vector(dat)){
    new_colname <- deparse(substitute(dat))
    df <- data_frame(!!new_colname := dat)
    df[[new_colname]] <- dat
    df[["cell_id"]] <- names(dat)
    sc_obj$meta_dat <- left_join(sc_obj$meta_dat,
                                 df,
                                 by = "cell_id")
    
  } else if (is.data.frame(dat)) {
    sc_obj$meta_dat <- left_join(sc_obj$meta_dat,
                                 dat,
                                 by = "cell_id")
  }
  sc_obj
}

compute_summaries <- function(sc_obj){
  ## raw counts
  total_umis <- colSums(sc_obj$umi_matrix)
  names(total_umis) <- colnames(sc_obj$umi_matrix)
  total_reads <- colSums(sc_obj$read_matrix)
  names(total_reads) <- colnames(sc_obj$read_matrix)
  
  ## norm counts
  norm_total_umis <- colSums(sc_obj$norm_umi)
  names(norm_total_umis) <- colnames(sc_obj$norm_umi)
  norm_total_reads <- colSums(sc_obj$norm_reads)
  names(norm_total_reads) <- colnames(sc_obj$norm_reads)
    
  sc_obj <- add_metadata(sc_obj, total_umis)
  sc_obj <- add_metadata(sc_obj, total_reads)
  sc_obj <- add_metadata(sc_obj, norm_total_umis)
  sc_obj <- add_metadata(sc_obj, norm_total_reads)
  
  ## compute cDNA duplication rate 
  sc_obj$meta_dat$cDNA_duplication <- 1 - (sc_obj$meta_dat$total_umis /
                                             sc_obj$meta_dat$total_reads)
  
  sc_obj
}

sc_objs <- map(sc_objs, generate_matrices)
sc_objs <- map(sc_objs, norm_libsize)
sc_objs <- map(sc_objs, compute_summaries)

Compute enrichment of reads/umis over the original library.

sc_objs <- map(sc_objs,
    function(sub_dat){
      og_counts <- select(sc_objs[[reflib]]$meta_dat,
                          og_total_reads = total_reads,
                          og_total_umis = total_umis,
                          og_norm_total_umis = norm_total_umis,
                          og_norm_total_reads = norm_total_reads,
                          og_cDNA_duplication = cDNA_duplication,
                          cell_id)
      sub_dat$meta_dat <- left_join(sub_dat$meta_dat,
                         og_counts, 
                         by = "cell_id")
      
      sub_dat$meta_dat <- mutate(sub_dat$meta_dat,
                                 read_proportion = log2( total_reads / og_total_reads),
                                 umi_proportion = log2( total_umis / og_total_umis),
                                 norm_read_proportion = log2( norm_total_reads /
                                                                og_norm_total_reads),
                                 norm_umi_proportion = log2( norm_total_umis /
                                                               og_norm_total_umis))
      sub_dat
    })

plot cDNA duplication rate

sc_metadat <- map(sc_objs, ~.x$meta_dat) %>% 
  bind_rows(.id = "library") %>% 
  mutate(library = factor(library, levels = libs)) %>% 
  arrange(resampled)

plt <- ggplot(sc_metadat, aes(total_umis, cDNA_duplication)) +
  geom_point(aes(color = resampled), size = 0.5) +
  labs(x = expression(paste("# of UMIs")),
       y = "Sequencing saturation") + 
  scale_x_log10(labels = scales::comma) +
  scale_color_manual(values = color_palette) +
  guides(colour = guide_legend(override.aes = list(size = 5))) + 
  facet_wrap(~library,
             labeller = labeller(library = lib_names)) +
  theme_cowplot(font_size = 16, line_size = 1)  +
  theme(legend.position = "top",
        plot.margin = unit(c(5.5, 20.5, 5.5, 5.5), 
                           "points")) 

plt
Note the high level of sequencing saturation (0 = no-duplication, 1 = all duplicates) in the original library. Also note that the libraries tend to have higher saturatioin rates, after subsampling.

Note the high level of sequencing saturation (0 = no-duplication, 1 = all duplicates) in the original library. Also note that the libraries tend to have higher saturatioin rates, after subsampling.

save_plot("cDNA_duplication.pdf", plt, 
          base_height = 5.5,
          base_aspect_ratio = 0.85)

plot read and umi counts per library

global_plot_theme <- theme(
        legend.position = "top",
        legend.text = element_text(size = 10),
        strip.text = element_text(size = 8))

resampled_metadat <- filter(sc_metadat,
                             library != "original") %>% 
  mutate(library = factor(library, 
                          levels = c(
                               "standard",
                               "biotinylated",
                               "phosphorothioate")))

unnorm_plt <- ggplot(resampled_metadat, 
                     aes(og_total_reads / 3, total_reads / 3, colour = resampled)) + 
  geom_point(size = 0.5) + 
  geom_abline(slope = 1) +
  facet_wrap(~library, nrow = 1) + 
  coord_fixed() +
  xlab("original library\nreads count (Thousands)") +
  ylab("resampled library\nreads count (Thousands)") +
 # ggtitle("Raw reads associated with each cell barcode") +
  scale_colour_manual(name = "Resampled:",
                      values = color_palette) +
  theme_cowplot(font_size = 10, line_size = .5) +
  global_plot_theme

norm_plt <- ggplot(resampled_metadat, aes(og_norm_total_reads / 1e3, norm_total_reads / 1e3, colour = resampled)) + 
  geom_point(size = 0.5) + 
  geom_abline(slope = 1) + 
  facet_wrap(~library, nrow = 1) + 
  xlab("original library\nRPM (Thousands)") +
  ylab("resampled library\nRPM (Thousands)") +
  scale_colour_manual(name = "Resampled:",
                      values = color_palette) +
  theme_cowplot(font_size = 10, line_size = 0.5) +
  theme(aspect.ratio = 1) + 
  global_plot_theme

unnorm_umi_plt <- ggplot(resampled_metadat, 
                         aes(og_total_umis / 1e3, total_umis / 1e3, colour = resampled)) + 
  geom_point(size = 0.5) + 
  geom_abline(slope = 1) +
  coord_fixed() +
  facet_wrap(~library, nrow = 1) + 
  xlab("original library\nUMI count (Thousands)") +
  ylab("resampled library\nUMI count (Thousands)") +
  scale_colour_manual(name = "Resampled:",
                      values = color_palette) +
  theme_cowplot(font_size = 10, line_size = .5) +
  global_plot_theme

norm_umi_plt <- ggplot(resampled_metadat, 
                       aes(og_norm_total_umis / 1e3, norm_total_umis / 1e3, colour = resampled)) + 
  geom_point(size = 0.5) + 
  geom_abline(slope = 1) + 
  facet_wrap(~library, nrow = 1) + 
  xlab("original library\nUMI normalized RPM (Thousands)") +
  ylab("resampled library\nUMIs per Million (Thousands)") +
  scale_colour_manual(name = "Resampled:",
                      values = color_palette) +
  theme_cowplot(font_size = 10, line_size = 0.5) +
  theme(aspect.ratio = 1) + 
  global_plot_theme

plt <- plot_grid(unnorm_plt, norm_plt, unnorm_umi_plt, norm_umi_plt, 
                 labels = "AUTO",
                 align = 'hv')
plt

save_plot("reads_per_barcode_scatterplots.pdf", plt, base_width = 8 )

Plot enrichment of umis

umi_plots <- map(split(resampled_metadat, resampled_metadat$library),
  function(x){
    ggplot(x, 
       aes(og_total_umis, norm_umi_proportion, colour = resampled)) + 
  geom_point(size = 0.5) + 
  geom_hline(aes(yintercept = 0), 
             linetype ="dashed", 
             color = "darkgrey") + 
  scale_x_continuous(labels = scales::comma) + 
  labs(x = "Abundance in original\nlibrary (UMIs)", 
       y = expression(paste( "Log"[2], " Normalized UMIs ", 
                             frac("resampled", "original")))) +
  scale_colour_manual(name = "Resampled:", values = color_palette) +
  guides(colour = guide_legend(override.aes = list(size = 5))) + 
  theme_cowplot(font_size = 16, line_size = 1) +
  theme(legend.position = "top",
        legend.text = element_text(size = 12))})

plt <- plot_grid(plotlist = umi_plots, nrow = 1)
save_plot("umi_ratio_MA.pdf", plt, 
          ncol = 3, nrow = 1, 
          base_width = 5, 
          base_height = 5)
ggplot(resampled_metadat, aes(resampled, 
                read_proportion, fill = resampled)) + 
  geom_boxplot(coef = Inf) + 
  facet_wrap(~library, nrow = 1) +
  xlab("Selected for Reamplification") +
  ylab(expression(paste( "Log"[2], " Reads ", frac("resampled", "original")))) +
  scale_fill_manual(name = "Resampled:",
                    values = color_palette) +
  theme_cowplot(font_size = 16, line_size = 0.5) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    legend.position = "top",
    legend.text = element_text(size = 12)
  )

ggsave("reads_ratio_per_barcode_boxplot.pdf", width = 6, height = 5)

ggplot(resampled_metadat, aes(resampled, 
                umi_proportion, fill = resampled)) + 
  geom_boxplot(coef = Inf) + 
  facet_wrap(~library, nrow = 1) +
  xlab("Selected for Reamplification") +
  ylab(expression(paste( "Log"[2], " UMIs ", frac("resampled", "original")))) +
  scale_fill_manual(name = "Resampled:",
                    values = color_palette) +
  theme_cowplot(font_size = 16, line_size = 0.5) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    legend.position = "top",
    legend.text = element_text(size = 12)
  )

ggsave("umi_ratio_per_barcode_boxplot.pdf", width = 6, height = 5)

ggplot(resampled_metadat, aes(resampled, 
                norm_read_proportion, fill = resampled)) + 
  geom_boxplot(coef = Inf) + 
  facet_wrap(~library, nrow = 1) +
  xlab("Selected for Reamplification") +
  ylab(expression(paste( "Log"[2], " normalized Reads ", frac("resampled", "original")))) +
  scale_fill_manual(name = "Resampled:",
                    values = color_palette) +
  theme_cowplot(font_size = 16, line_size = 0.5) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    legend.position = "top",
    legend.text = element_text(size = 12)
  )

ggsave("norm_reads_ratio_per_barcode_boxplot.pdf", width = 6, height = 5)

ggplot(resampled_metadat, aes(resampled, 
                norm_umi_proportion, fill = resampled)) + 
  geom_boxplot(coef = Inf) + 
  facet_wrap(~library, nrow = 1) +
  xlab("Selected for Reamplification") +
  ylab(expression(paste( "Log"[2], " normalized UMIs ", frac("resampled", "original")))) +
  scale_fill_manual(name = "Resampled:",
                    values = color_palette) +
  theme_cowplot(font_size = 16, line_size = 0.5) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    legend.position = "top",
    legend.text = element_text(size = 12)
  )

ggsave("norm_umi_ratio_per_barcode_boxplot.pdf", width = 6, height = 5)
dat <- group_by(resampled_metadat, library) %>%
  filter(library != "original") %>% 
  mutate(total_new = sum(total_reads, na.rm = T), 
         total_old = sum(og_total_reads, na.rm = T))

dat_group <- group_by(dat, library, resampled) %>% 
  summarize(total_new = sum(total_reads, 
                            na.rm = T) / unique(total_new), 
            total_old = sum(og_total_reads, 
                            na.rm = T) / unique(total_old)) %>% 
  gather(lib, percent_lib, -library, -resampled ) %>%
  mutate(lib = factor(lib, levels = c("total_old", "total_new"), 
                      labels = c("Original\nLibrary",
                                 "Resampled\nLibrary")),
         resampled = factor(resampled, levels = c( "TRUE", "FALSE"))) %>% 
  arrange(resampled)

plt <- ggplot(dat_group, aes(lib, percent_lib, fill = resampled)) + 
  geom_bar(stat = "identity") + 
  ylab("Fraction of Reads Assigned") +
  scale_fill_manual(name = "Resampled:",
                    values = rev(color_palette)) +
  facet_wrap(~library, labeller = labeller(library = lib_names)) + 
  theme_cowplot(font_size = 16, line_size = 1) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5),
    legend.position = "top",
    legend.text = element_text(size = 16),
    strip.text = element_text(margin = margin(2, 2, 2, 2, "mm"))
    )

save_plot("proportion_reads_all_barcode_barplot.pdf", 
          plt,
          base_height = 7,
          base_aspect_ratio = 1)

dat_group %>% 
 rename(Method = library) %>% 
  spread(lib, percent_lib) %>% 
  mutate(`Targeted Library Read Fold-Enrichment` = `Resampled\nLibrary` / `Original\nLibrary`) %>%
  filter(resampled == T) %>% 
  select(Method, `Targeted Library Read Fold-Enrichment`)

Compare species

add_species_counts <- function(sc_obj, 
                               mouse_gene_prefix = "mm38::",
                               human_gene_prefix = "hg38::"){
  
  ## get mouse and human reads
  g_ids <- rownames(sc_obj$read_matrix)
  mouse_ids <- str_subset(g_ids, str_c("^", mouse_gene_prefix))
  human_ids <- str_subset(g_ids, str_c("^", human_gene_prefix))
  
  mouse_reads = colSums(sc_obj$read_matrix[mouse_ids, ])
  human_reads = colSums(sc_obj$read_matrix[human_ids, ])
  
  ## get mouse and human UMIs
  g_ids <- rownames(sc_obj$umi_matrix)
  mouse_ids <- str_subset(g_ids, str_c("^", mouse_gene_prefix))
  human_ids <- str_subset(g_ids, str_c("^", human_gene_prefix))
  
  mouse_umis = colSums(sc_obj$umi_matrix[mouse_ids, ])
  human_umis = colSums(sc_obj$umi_matrix[human_ids, ])
  
  ## get norm counts for reads 
  g_ids <- rownames(sc_obj$norm_reads)
  mouse_ids <- str_subset(g_ids, str_c("^", mouse_gene_prefix))
  human_ids <- str_subset(g_ids, str_c("^", human_gene_prefix))
  
  norm_human_reads <- colSums(sc_obj$norm_reads[human_ids, ])
  norm_mouse_reads <- colSums(sc_obj$norm_reads[mouse_ids, ])
  
  ## get norm counts for umis
  g_ids <- rownames(sc_obj$norm_umi)
  mouse_ids <- str_subset(g_ids, str_c("^", mouse_gene_prefix))
  human_ids <- str_subset(g_ids, str_c("^", human_gene_prefix))
  
  norm_human_umis <- colSums(sc_obj$norm_umi[human_ids, ])
  norm_mouse_umis <- colSums(sc_obj$norm_umi[mouse_ids, ])
  
  sc_obj <- add_metadata(sc_obj, human_reads)
  sc_obj <- add_metadata(sc_obj, mouse_reads)
  sc_obj <- add_metadata(sc_obj, human_umis)
  sc_obj <- add_metadata(sc_obj, mouse_umis)
  sc_obj <- add_metadata(sc_obj, norm_human_reads)
  sc_obj <- add_metadata(sc_obj, norm_mouse_reads)
  sc_obj <- add_metadata(sc_obj, norm_human_umis)
  sc_obj <- add_metadata(sc_obj, norm_mouse_umis)
  
  ## make sure mouse + human == total
  stopifnot(all(sc_obj$meta_dat$total_reads == sc_obj$meta_dat$mouse_reads + 
                                               sc_obj$meta_dat$human_reads, na.rm = T))
  
  stopifnot(all(sc_obj$meta_dat$total_umis == sc_obj$meta_dat$mouse_umis + 
                                               sc_obj$meta_dat$human_umis, na.rm = T))
  ## check floating point totals
  tol <- 1e-5
  reads_check <- all(abs(sc_obj$meta_dat$norm_total_reads - (sc_obj$meta_dat$norm_mouse_reads + 
                                               sc_obj$meta_dat$norm_human_reads)) <= tol, na.rm = T)
  stopifnot(reads_check)
  
  umis_check <- all(abs(sc_obj$meta_dat$norm_total_umis - (sc_obj$meta_dat$norm_mouse_umis + 
                                               sc_obj$meta_dat$norm_human_umis)) <= tol, na.rm = T)
  stopifnot(umis_check)
  ## calculate species purity (human / human + mouse)
  sc_obj$meta_dat <- mutate(sc_obj$meta_dat, 
                            purity_reads = human_reads / (human_reads + mouse_reads),
                            purity_umis = human_umis / (human_umis + mouse_umis))
  sc_obj
}

sc_objs <- map(sc_objs, add_species_counts)
## add in metadat columns for original mouse and original human data
sc_objs <- map(sc_objs,
    function(sub_dat){
      og_dat <- select(sc_objs$original$meta_dat,
                          cell_id,
                          str_subset(colnames(sc_objs$original$meta_dat),
                                     "human|mouse|purity"))
                          
      cols <- colnames(og_dat)
      new_cols <- c("cell_id", str_c("og_", cols[2:length(cols)]))
      colnames(og_dat) <- new_cols
      
      sub_dat$meta_dat <- left_join(sub_dat$meta_dat,
                         og_dat, 
                         by = "cell_id")
      
      sub_dat$meta_dat <- mutate(sub_dat$meta_dat,
                                 norm_human_read_proportion = log2( norm_human_reads /
                                                                og_norm_human_reads),
                                 norm_human_umi_proportion = log2( norm_human_umis /
                                                               og_norm_human_umis),
                                 norm_mouse_read_proportion = log2( norm_mouse_reads /
                                                                og_norm_mouse_reads),
                                 norm_mouse_umi_proportion = log2( norm_mouse_umis /
                                                               og_norm_mouse_umis))
      
      sub_dat
    })


resampled_metadat <- map(sc_objs, ~.x$meta_dat) %>% 
  bind_rows(.id = "library") %>% 
   mutate(library = factor(library, 
                          levels = c(
                               "original",
                               "standard",
                               "biotinylated",
                               "phosphorothioate"))) %>% 
  arrange(resampled)
read_plots <- map(split(resampled_metadat, resampled_metadat$library),
  function(x){
  ggplot(x, 
         aes(human_reads, mouse_reads, 
                               colour = resampled)) +
  geom_point(size = 0.5) + 
  facet_wrap(~library, nrow = 1, scales = "free",
             labeller = labeller(library = lib_names)) + 
  xlab("Human reads") +
  ylab("Mouse reads") +
  scale_colour_manual(name = "Resampled:",
                      values = color_palette) +
  scale_x_continuous(labels = scales::comma) +
  scale_y_continuous(labels = scales::comma) +
  guides(colour = guide_legend(override.aes = list(size = 5))) + 
  theme_cowplot(font_size = 16, line_size = 1) +
  theme(legend.position = "top",
        legend.text = element_text(size = 12))
  })
plt <- plot_grid(plotlist = read_plots,
                 nrow = 1)
save_plot("read_scatterplot_human_mouse.pdf", 
          plt, ncol = length(libs), nrow = 1,
          base_height = 5, base_aspect_ratio = 1)

umi_plots <- map(split(resampled_metadat, resampled_metadat$library),
  function(x){
  ggplot(x,
       aes(human_umis, 
           mouse_umis, 
           colour = resampled)) +
  geom_point(size = 0.5) + 
   facet_wrap(~library, nrow = 1, scales = "free",
             labeller = labeller(library = lib_names)) + 
  xlab("Human UMIs") +
  ylab("Mouse UMIs") +
  scale_colour_manual(name = "Resampled:",
                      values = color_palette) +
  scale_x_continuous(labels = scales::comma) +
  scale_y_continuous(labels = scales::comma) +
  guides(colour = guide_legend(override.aes = list(size = 5))) + 
  theme_cowplot(font_size = 16, line_size = 1) +
  theme(legend.position = "top",
        legend.text = element_text(size = 12))
  })

plt <- plot_grid(plotlist = umi_plots,
                 nrow = 1)

save_plot("umi_scatterplot_human_mouse.pdf", 
          plt, ncol = length(libs), nrow = 1,
          base_height = 5, base_aspect_ratio = 1)

Genes detected

## compute per gene or per gene/umi combo enrichment
detected_molecules <- function(sc_obj, molecule = "gene"){
  umis <- sc_obj$umi_matrix
  if (molecule == "gene"){
    human_mat <- umis[str_detect(rownames(umis), "hg38::"), ]
    mouse_mat <- umis[str_detect(rownames(umis), "mm38::"), ]
    n_human_genes <- colSums(human_mat > 0)
    n_mouse_genes <- colSums(mouse_mat > 0)
    out_mdat <- data_frame(cell_id = colnames(umis),
      n_human_genes = n_human_genes,
      n_mouse_genes = n_mouse_genes)
    sc_obj <- add_metadata(sc_obj, out_mdat)
    }
  
}
sc_objs <- map(sc_objs, ~detected_molecules(.x))
resampled_metadat <- map(sc_objs, ~.x$meta_dat) %>% 
  bind_rows(.id = "library") %>% 
   mutate(library = factor(library, 
                          levels = c(
                               "original",
                               "standard",
                               "biotinylated",
                               "phosphorothioate")))

og_genes <- filter(resampled_metadat, 
                   library == "original") %>% 
  dplyr::select(cell_id, 
                og_human_genes = n_human_genes, 
                og_mouse_genes = n_mouse_genes)

resampled_metadat <- left_join(resampled_metadat, 
                                og_genes,
                                by = "cell_id")

ggplot(resampled_metadat, aes(og_human_genes,
                               n_human_genes, colour = resampled)) + 
  geom_point() + 
  ylab("resampled_genes") +
  xlab("original_genes") + 
  scale_colour_manual(name =  "Resampled:", values = color_palette) +
  facet_wrap(~library, nrow = 1) + 
  theme_cowplot(font_size = 16, line_size = 1) +
  theme(
    legend.position = "top",
    legend.text = element_text(size = 16)
  )

ggplot(resampled_metadat, aes(og_mouse_genes,
                               n_mouse_genes,
                               colour = resampled)) + 
  geom_point() + 
  ylab("resampled_genes") +
  xlab("original_genes") + 
  scale_colour_manual(name =  "Resampled:", values = color_palette) +
  facet_wrap(~library, nrow = 1) + 
  theme_cowplot(font_size = 16, line_size = 1) +
  theme(
    legend.position = "top",
    legend.text = element_text(size = 16)
  )

Parse out new versus previously identified genes

calc_gene_sensitivity <- function(sc_obj, 
                                  type = "umi",
                                  mouse_gene_prefix = "mm38::",
                                  human_gene_prefix = "hg38::"){
  
  if (type == "umi"){
    count_matrix <- sc_obj$umi_matrix
  } else {
    count_matrix <- sc_obj$read_matrix
  }
  # generate list named with barcode of each detected gene and 
  # respective read/umi count
  genes_detected <- apply(count_matrix, 2, function(x) x[x > 0])
  sc_obj$genes_detected <- genes_detected
  sc_obj
}

sc_objs <- map(sc_objs, calc_gene_sensitivity)
sc_objs <- map(sc_objs, 
           function(x){
             og_genes <- sc_objs$original$genes_detected
             sub_genes <- x$genes_detected
             
             # subset list of cell barcodes to the same as the og experiment
             # and also reorders the barcodes to match
             sub_genes <- sub_genes[names(og_genes)]
             
             if(length(sub_genes) != length(og_genes)){
               stop("barcode lengths not the same")
             }
             shared_genes <- map2(sub_genes, 
                                  og_genes,
                                  ~intersect(names(.x),
                                             names(.y)))
             new_genes <- map2(sub_genes,
                               og_genes,
                               ~setdiff(names(.x),
                                        names(.y)))
             
             not_recovered_genes <- map2(og_genes,
                                         sub_genes,
                                         ~setdiff(names(.x),
                                                  names(.y)))
             x$shared_genes <- shared_genes
             x$new_genes <- new_genes
             x$not_recovered_genes <- not_recovered_genes
             return(x)
             })

## add gene recovery info to meta data table
sc_objs <- map(sc_objs, 
           function(x){
             shared_genes <- map2_dfr(x$shared_genes, 
                                names(x$shared_genes),
                                function(x, y){
                                  mouse <- sum(str_detect(x, "^mm38::")) ;
                                  human <- sum(str_detect(x, "^hg38::")) ;
                                  data_frame(cell_id = y,
                                            mouse_shared_genes = mouse,
                                            human_shared_genes = human,
                                            shared_genes = mouse + human)
                                 })
             
             not_recovered_genes <- map2_dfr(x$not_recovered_genes, 
                                names(x$not_recovered_genes),
                                function(x, y){
                                  mouse <- sum(str_detect(x, "^mm38::")) ;
                                  human <- sum(str_detect(x, "^hg38::")) ;
                                  data_frame(cell_id = y,
                                            mouse_not_recovered_genes = mouse,
                                            human_not_recovered_genes = human,
                                            not_recovered_genes = mouse + human)
                                 })
             
             new_genes <- map2_dfr(x$new_genes, 
                                names(x$new_genes),
                                function(x, y){
                                  mouse <- sum(str_detect(x, "^mm38::")) ;
                                  human <- sum(str_detect(x, "^hg38::")) ;
                                  data_frame(cell_id = y,
                                            mouse_new_genes = mouse,
                                            human_new_genes = human,
                                            new_genes = mouse + human)
                                 })
             gene_mdata <- left_join(shared_genes,
                                     not_recovered_genes,
                                     by = "cell_id") %>% 
               left_join(., new_genes, by = "cell_id")
             
             x <- add_metadata(x, gene_mdata)
             x
           })


resampled_metadat <- map(sc_objs, ~.x$meta_dat) %>% 
  bind_rows(.id = "library") %>% 
   mutate(library = factor(library, 
                          levels = c(
                               "original",
                               "standard",
                               "biotinylated",
                               "phosphorothioate")))
genes_recovered <- resampled_metadat %>% 
  dplyr::filter(library != "original") %>% 
  dplyr::select(cell_id, 
                library,
                resampled,
                shared_genes,
                not_recovered_genes, 
                new_genes)

genes_recovered <- gather(genes_recovered, 
                          key = type, value = count, 
                          -cell_id, -resampled, -library)
genes_recovered <- mutate(genes_recovered,
                          type = str_replace_all(type, "_", "\n"))

plt <- ggplot(genes_recovered, 
       aes(cell_id, count)) +
  geom_point(aes(color = resampled),
             size = 0.6,
             alpha = 0.75) +
  facet_grid(type ~ library) +
  theme(axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        strip.text.y = element_text(size = 12,
                                    margin = margin(0,0.2,0,0.2, "cm"))) +
  scale_color_manual(values = color_palette)

plt 

save_plot("new_genes_detected.pdf", plt, base_width = 8, base_height = 8)

targeted <- genes_recovered %>% 
  dplyr::filter(library == "phosphorothioate",
                resampled) 
targeted
plt_dat <- genes_recovered %>% 
  dplyr::filter(library != "phosphorothioate",
                resampled) %>% 
  group_by(type) %>% 
  summarize(count = mean(count)) %>% 
  mutate(cell_id = "Not Targeted Barcodes") %>% 
  bind_rows(targeted, .) %>% 
  filter(type == "new\ngenes")


plt <- ggplot(plt_dat, 
       aes(cell_id, count)) +
  geom_bar(aes(fill = cell_id),
           stat = "identity") +
  labs(y = "Newly detected genes") + 
  theme(axis.title.x = element_blank(),
        axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        legend.position = "none") +
  scale_fill_brewer(palette = "Set1")

save_plot("new_genes_barplot.pdf", plt, 
          base_width = 3.6, base_height = 5)


## species specific

genes_recovered <- resampled_metadat %>% 
     dplyr::filter(library == "phosphorothioate") %>% 
     select(cell_id, resampled, purity_reads, ends_with("genes")) %>% 
  mutate(species = ifelse(purity_reads > 0.80,
                          "human",
                          ifelse(purity_reads < 0.20, 
                                 "mouse",
                                 "mixed"))) %>% 
  filter(species != "mixed") %>% 
  mutate(shared_genes_species = ifelse(species == "human",
                                          human_shared_genes,
                                          mouse_shared_genes),
         new_genes_species = ifelse(species == "human",
                                    human_new_genes,
                                    mouse_new_genes),
         not_recovered_genes_species = ifelse(species == "human",
                                              human_not_recovered_genes,
                                              mouse_not_recovered_genes)) %>% 
  select(cell_id, resampled, ends_with("_species")) %>% 
  gather(key = type, value = count, 
         -cell_id, -resampled) %>% 
   mutate(type = str_replace(type, "_species", "") %>% 
            str_replace_all(., "_", "\n"))

targeted <- genes_recovered %>% 
  dplyr::filter(resampled) 

plt_dat <- genes_recovered %>% 
  dplyr::filter(!resampled) %>% 
  group_by(type) %>% 
  summarize(count = mean(count)) %>% 
  mutate(cell_id = "Not targeted\nbarcodes\n(mean)") %>% 
  bind_rows(targeted, .) %>% 
  mutate(type = ifelse(type == "new\ngenes",
                        "Newly\ndetected\ngenes",
                        ifelse(type == "shared\ngenes",
                               "Previously\ndetected\ngenes",
                               ifelse(type == "not\nrecovered\ngenes",
                                      "Previously\ndetected\ngenes\nnot recovered",
                                      NA))))


plt <- ggplot(plt_dat, 
       aes(cell_id, count)) +
  geom_bar(aes(fill = type),
           stat = "identity") +
  labs(y = "# of Genes") + 
  scale_x_discrete(labels = cell_names[[resampled_lib]]) + 
  scale_y_continuous(labels = scales::comma) + 
  scale_fill_brewer(palette = "Set1", name = "") +
  guides(fill = guide_legend(override.aes = list(size = 0.25))) +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        legend.position = "top",
        plot.margin = unit(c(5.5, 50.5, 5.5, 5.5), 
                           "points"))
      #  legend.key.size = unit(0.25, "pt")) 

save_plot("new_genes_species_specific_barplot.pdf", plt, 
          base_width = 4, base_height = 5) 

Plot expression per cell

MA plots

calc_ma <- function(xmat, ymat, cell = "GCAGTTAAGTGTCCAT"){
  x_rn <- rownames(xmat)
  y_rn <- rownames(ymat)
  xmat <- log2(xmat + 1)
  ymat <- log2(ymat + 1)
  
  rownames(xmat) <- x_rn
  rownames(ymat) <- y_rn
  m <- rowMeans(log2(((2^ymat + 2^xmat) / 2) + 1))
  a <- xmat[, cell] - ymat[, cell]
  data_frame(gene = names(a),
             mean_expression_log2 = m,
             log2_diff = a)
}

genes_to_plot <- rownames(sc_objs[[resampled_lib]]$umi_matrix)
cols <- colnames(sc_objs[[resampled_lib]]$norm_umi)
cell_ids <- cells[[resampled_lib]]

## append genes to reference library if necessary
ref_mat <- standardize_rows(sc_objs[[resampled_lib]]$umi_matrix[, cols],
                            sc_objs[[reflib]]$umi_matrix[, cols])

ma_dat <- map(cell_ids,
    ~calc_ma(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot, cols], 
             ref_mat[genes_to_plot, cols],
             cell = .x))

names(ma_dat) <- cell_ids
ma_dat <- bind_rows(ma_dat, .id = "cell")

plot_ma <- function(df){
  n_up <- filter(df, log2_diff > 0) %>% 
    group_by(cell) %>% 
    summarize(n = n(), n = paste0("up = ", n))
  
  n_down <- filter(df, log2_diff < 0) %>% 
    group_by(cell) %>% 
    summarize(n = n(), n = paste0("down = ", n))
  
  if (nrow(n_down) == 0) {
    n_down = data_frame(cell = df$cell %>% unique(),
                        n = "down = 0")
  }
  plt <- ggplot(df,
         aes(mean_expression_log2,
             log2_diff)) +
    geom_hline(aes(yintercept = 0), linetype = "dashed", colour = "grey") + 
    geom_point(size = 0.25) +
    geom_text(data = n_up, 
              aes(x = max(ma_dat$mean_expression_log2) * 0.66, 
                               y = max(ma_dat$log2_diff) * 1.2, 
                               label = n)) +
    geom_text(data = n_down, 
              aes(x = max(ma_dat$mean_expression_log2) * 0.66, 
                                 y = min(ma_dat$log2_diff) * 1.2, 
                                 label = n)) + 
    facet_wrap(~cell) +
    labs(x = expression(paste("Abundance (log"[2], ")")),
         y = expression(paste(frac("resampled","Original"), " (log"[2], ")")))
  plt
}

plt <- plot_ma(ma_dat)
plt

save_plot("per_cell_MA_plot_all_genes.pdf", plt, base_height = 6)

## Shared genes
ma_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- sc_objs[[resampled_lib]]$shared_genes[[x]]
    calc_ma(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(ma_dat) <- cell_ids
ma_dat <- bind_rows(ma_dat, .id = "cell")
plt <- plot_ma(ma_dat)
plt

save_plot("per_cell_MA_plot_shared_genes.pdf", plt, base_height = 6)


## New genes
ma_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- sc_objs[[resampled_lib]]$new_genes[[x]]
    calc_ma(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(ma_dat) <- cell_ids
ma_dat <- bind_rows(ma_dat, .id = "cell")
plt <- plot_ma(ma_dat)
plt

save_plot("per_cell_MA_plot_new_genes.pdf", plt, base_height = 6)

Histograms

get_expr <- function(xmat, ymat, cell = "GCAGTTAAGTGTCCAT"){
  xrows <- rownames(xmat)
  xmat <- log2(xmat[, cell] + 1)
  ymat <- log2(ymat[xrows, cell] + 1)
  data_frame(
    gene = xrows,
    resampled = xmat,
    original = ymat) %>% 
    gather(library, 
           Expression, -gene)
}

plot_histogram <- function(df){
  ggplot(df, 
         aes_string("Expression")) +
    geom_density(aes_string(fill = "library"),
                 alpha = 0.66) +
    scale_fill_viridis_d(name = "") +
    facet_wrap(~cell, nrow = 1) +
    theme(legend.position = "top",
          strip.text = element_text(size = 8)) 
}

expr_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- names(sc_objs[[resampled_lib]]$genes_detected[[x]])
    get_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
expressed_in_resampled_plt <- plot_histogram(expr_dat)
expressed_in_resampled_plt

expr_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- sc_objs[[resampled_lib]]$shared_genes[[x]]
    get_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
share_genes_plt <- plot_histogram(expr_dat)
share_genes_plt

expr_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- sc_objs[[resampled_lib]]$new_genes[[x]]
    get_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
new_gene_plt <- plot_histogram(expr_dat)


plts <- list(
  expressed_in_resampled_plt,
  share_genes_plt,
  new_gene_plt
)

plt <- plot_grid(plotlist = plts, ncol = 1)
plt

save_plot("expression_histograms.pdf", plt, 
          ncol = 1, nrow = 3,
          base_height = 4,
          base_aspect_ratio = 2)

scatterplots

get_paired_expr <- function(xmat, ymat, cell = "GCAGTTAAGTGTCCAT"){
  xrows <- rownames(xmat)
  xmat <- log2(xmat[, cell] + 1)
  ymat <- log2(ymat[xrows, cell] + 1)
  data_frame(
    gene = xrows,
    resampled = xmat,
    original = ymat)
}

plot_scatter <- function(df){
  
  ggplot(df, 
         aes_string("original", "resampled")) +
    geom_point(size = 0.5) + 
    geom_abline(aes(slope = 1, intercept = 0)) +
    facet_wrap(~cell, nrow = 1) +
    expand_limits(x = 0, y = 0) +
    coord_fixed() + 
    scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) +
    scale_y_continuous(breaks = scales::pretty_breaks(n = 5)) + 
    theme(legend.position = "top",
          strip.text = element_text(size = 8)) 
}

expr_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- names(sc_objs[[resampled_lib]]$genes_detected[[x]])
    get_paired_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
expressed_in_resampled_plt <- plot_scatter(expr_dat)
expressed_in_resampled_plt

expr_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- sc_objs[[resampled_lib]]$shared_genes[[x]]
    get_paired_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
share_genes_plt <- plot_scatter(expr_dat)
share_genes_plt

expr_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- sc_objs[[resampled_lib]]$new_genes[[x]]
    get_paired_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
new_gene_plt <- ggplot(expr_dat, aes(cell,resampled)) +
  geom_jitter(alpha = 0.55) +
  geom_violin(aes(fill = cell)) +
  ylim(0, max(expr_dat$resampled) * 1.10) + 
  scale_fill_brewer(palette = "Set1") +
    theme(axis.text.x = element_text(angle = 90, 
                                     hjust = 1, vjust = 0.5),
          legend.pos = "none",
          axis.title.x = element_blank()) 

new_gene_plt

plts <- list(
  expressed_in_resampled_plt,
  share_genes_plt
)

plt <- plot_grid(plotlist = plts, ncol = 1)
plt

save_plot("expression_scatterplots.pdf", plt, 
          ncol = 1, nrow = 3,
          base_height = 4,
          base_aspect_ratio = 2)

save_plot("expression_newgenes_violinplots.pdf", new_gene_plt, 
          base_height = 6,
          base_aspect_ratio = 0.5)

Parse out new verses old UMIs

compare_umis <- function(path_to_ctrl,
                         path_to_test,
                         return_summary = F){

  ## umi seqs should be produced by ./get_molecule_info
  ctrl_umi_seqs <- read_tsv(path_to_ctrl,
                   col_names = c("barcode_10x", 
                                 "umi_molecule", 
                                 "count")) %>% 
  filter(barcode_10x != "Cell_unmatched")

  test_umi_seqs <- read_tsv(path_to_test,
                   col_names = c("barcode_10x", 
                                 "umi_molecule", 
                                 "count")) %>% 
  filter(barcode_10x != "Cell_unmatched")


  umi_seqs <- full_join(ctrl_umi_seqs, 
          test_umi_seqs, 
          by = c("barcode_10x", "umi_molecule"))
  
  if (return_summary) {
    umi_seqs %>% 
      mutate(new_umi = ifelse(is.na(count.x) & !is.na(count.y), 
                          1L, 
                          0L),
         not_detected_umi = ifelse(!is.na(count.x) & is.na(count.y),
                                   1L,
                                   0L),
         shared_umi = ifelse(!is.na(count.x) & !is.na(count.y),
                             1L,
                             0L)) %>% 
      group_by(barcode_10x) %>% 
      summarize(new_umis = sum(new_umi),
            not_detected_umis = sum(not_detected_umi),
            shared_umis = sum(shared_umi))
  } else {
    umi_seqs
  }
}

umi_files <- file.path(data_dir, libs, "umis", "umigroups.txt.gz")

umi_summaries <- map(umi_files[2:4],
                   ~compare_umis(umi_files[1], .x, return_summary = T))

names(umi_summaries) <- umi_files[2:4] %>% 
  str_split(., "/") %>% 
  map_chr(~.x[7])

umi_summary <- bind_rows(umi_summaries, .id = "library")

umis_recovered <- umi_summary %>% 
  gather(class, count, -barcode_10x, -library) 

## annotate with resampled or not
libs <- c("standard", "biotinylated","phosphorothioate")
cell_annot <- data_frame(barcode_10x = cells,
                         library = libs,
                         resampled = T) %>% 
  unnest()

umis_recovered <- umis_recovered %>% 
  left_join(., cell_annot, by = c("library", "barcode_10x")) %>% 
  mutate(resampled = ifelse(is.na(resampled),
                             F,
                             resampled),
         library = factor(library, levels = libs)) %>% 
  arrange(resampled)

plt <- ggplot(umis_recovered, 
       aes(barcode_10x, count)) +
  geom_point(aes(colour = resampled),
             size = 0.6,
             alpha = 0.75) +
  facet_grid(library ~ class) +
  theme(axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        strip.text.y = element_text(size = 12,
                                    margin = margin(0,0.2,0,0.2, "cm"))) +
  scale_color_manual(values = color_palette)

plt 

umi_seqs <- map(umi_files[2:4],
                ~compare_umis(umi_files[1], .x, return_summary = F))

names(umi_seqs) <- umi_files[2:4] %>% 
  str_split(., "/") %>% 
  map_chr(~.x[7])

new_umis <- map2(umi_seqs, 
                split(cell_annot, cell_annot$library)[libs],
    ~filter(.x, 
       barcode_10x %in% .y$barcode_10x, 
       !is.na(count.y), 
       is.na(count.x))  %>% 
  separate(umi_molecule, c("seq", "genome", "gene"), sep = "::") %>% 
    select(-starts_with("count"))) 

old_umis <- map2(umi_seqs, 
                split(cell_annot, cell_annot$library)[libs],
    ~filter(.x, 
       barcode_10x %in% .y$barcode_10x, 
       !is.na(count.x),
       !is.na(count.y))  %>% 
  separate(umi_molecule, c("seq", "genome", "gene"), sep = "::") %>% 
    select(-starts_with("count"))) 


umi_edit_dist <- map2(new_umis,
                      old_umis,
                     ~left_join(.x, .y, 
               by = c("barcode_10x", "genome", "gene")) %>% 
  na.omit() %>% 
  mutate(ed = kentr::get_hamming_pairs(seq.x, seq.y)) %>% 
  group_by(barcode_10x, seq.y, genome, gene) %>% 
  summarize(min_ed = as.integer(min(ed))) %>% 
  ungroup())

umi_edit_dist <- bind_rows(umi_edit_dist, 
                           .id = "library")

cell_names_df <- map(cell_names, 
                      ~data_frame(old_id = names(.x), 
                                  new_id = .x)) %>% 
  bind_rows(., .id = "library")

umi_edit_dist_plt <- left_join(umi_edit_dist,
                               cell_names_df, 
                               by = c("barcode_10x" = "old_id",
                                      "library")) %>% 
  rename(cell_id = new_id)

plt <- ggplot(umi_edit_dist_plt, aes(cell_id, 
                          min_ed)) + 
  geom_boxplot(coef = Inf) +
  facet_wrap(~library, scales = "free_x", 
             labeller = labeller(library = lib_names)) +
  scale_y_continuous(breaks= scales::pretty_breaks()) +
  labs(y = "Minimum edit distance\noriginal vs. new UMIs") +
  theme(axis.text.x = element_text(angle = 90,
                                   hjust = 1, 
                                   vjust = 0.5),
        axis.title.x = element_blank())

save_plot("umi_edit_distance.pdf", plt,
          base_width = 8)

Original Expt

original library tSNE

library(Seurat)

mat <- sc_objs$original$umi_matrix
human_cell_ids <- sc_objs$original$meta_dat %>% 
  filter(og_purity_reads > 0.80) %>% 
  filter(str_detect(cell_id, "1Tumor|2LukGFP|3Brain|4Bone"),
         !str_detect(cell_id, "2LukGFPDead")) %>% 
  pull(cell_id)

mat <- mat[, human_cell_ids]
mat <- mat[str_detect(rownames(mat), "^hg38::"), ]

sobj <- CreateSeuratObject(mat, names.field = 2, names.delim = "_")

new_ids <- sobj@meta.data %>% 
  rownames_to_column("cell") %>% 
  mutate(new_id = str_sub(orig.ident, 2),
         new_id = ifelse(new_id == "LukGFP",
                         "Primary\ncells",
                         new_id),
         new_id = ifelse(new_id == "Tumor",
                         "Primary\ntumor",
                         new_id)) %>% 
  select(cell, new_id) %>% 
  as.data.frame(.) %>% 
  column_to_rownames("cell")

sobj <- AddMetaData(sobj, new_ids)
sobj <- SetAllIdent(sobj, "new_id")
sobj <- NormalizeData(sobj)
sobj <- ScaleData(sobj, vars.to.regress = "nUMI")
sobj <- FindVariableGenes(sobj, do.plot = F)
sobj <- RunPCA(sobj, pc.genes = rownames(sobj@data), pcs.compute = 5, do.print = F)
sobj <- RunTSNE(sobj, dims.use = 1:5, seed.use = 20180427)
sobj <- FindClusters(sobj, 
                     dims.use = 1:5, save.SNN = T, print.output = F)
cell_percents <- group_by(sobj@meta.data, new_id) %>% 
  summarize(n_cells = n()) %>% 
  mutate(total_cells = sum(n_cells), 
         percentage = 100 * (n_cells / total_cells)) %>% 
  select(-total_cells)

cell_percents
tsne_dat <- GetCellEmbeddings(sobj, "tsne") %>% 
  as.data.frame() %>% 
  tibble::rownames_to_column("cell") %>% 
  left_join(sobj@meta.data %>%
              as.data.frame() %>% 
              tibble::rownames_to_column("cell"),
            ., by = "cell") %>% 
  left_join(., cell_percents, by = "new_id") %>% 
  mutate(cell_labels_annotated = str_c(new_id, 
                            " (",
                            signif(percentage, 3),
                            "%)"))
tsne_plt <- ggplot(tsne_dat, 
  aes(tSNE_1, tSNE_2)) + 
  geom_point(aes(color = cell_labels_annotated), size = 1) + 
  scale_color_brewer(palette = "Set1",
                     name = "") +
  labs(title = "",
       x = "tSNE 1",
       y = "tSNE 2") +
  theme_cowplot() + 
  guides(color = guide_legend(override.aes = list(size = 4))) + 
  theme(legend.pos = "right",
        aspect.ratio = 1,
        legend.key.size = unit(2, 'lines')) 

save_plot("original_tsne.pdf", tsne_plt, 
          base_width = 5.5, 
          base_aspect_ratio = 1.5)

original library tSNE supplemented with resampled barcodes

mat <- sc_objs$original$umi_matrix

human_cell_ids <- sc_objs$original$meta_dat %>% 
  filter(str_detect(cell_id, "1Tumor|2LukGFP|3Brain|4Bone"),
         !str_detect(cell_id, "2LukGFPDead")) %>% 
  pull(cell_id)

mat <- mat[, human_cell_ids]

resampled_ids <- sc_objs$phosphorothioate$meta_dat %>% 
  filter(resampled) %>% 
  pull(cell_id)

resampled_mat <- sc_objs$phosphorothioate$umi_matrix[, resampled_ids]
colnames(resampled_mat) <- str_c(colnames(resampled_mat), "::", "resampled")

mat <- as.data.frame(as.matrix(mat)) %>% rownames_to_column("cell")
resampled_mat <- as.data.frame(as.matrix(resampled_mat)) %>% rownames_to_column("cell")
combined_mats <- left_join(mat, resampled_mat, by = c("cell")) 
combined_mats <- as.data.frame(combined_mats) %>% 
  column_to_rownames("cell") %>% 
  as.matrix() %>% 
  as(., "sparseMatrix")   

combined_mats[is.na(combined_mats)] <- 0

sobj <- CreateSeuratObject(combined_mats, names.field = 2, names.delim = "_")

new_ids <- sobj@meta.data %>% 
  rownames_to_column("cell") %>% 
  mutate(new_id = str_sub(orig.ident, 2),
         new_id = ifelse(str_detect(new_id, "LukGFP"),
                         "Primary\ncells",
                         new_id),
         new_id = ifelse(str_detect(new_id, "Tumor"),
                         "Primary\ntumor",
                         new_id),
         resampled = ifelse(str_detect(cell, "resampled"),
                             "resampled",
                             "not resampled"))

resampled_cell_ids <- new_ids[new_ids$resampled == "resampled", "cell"] %>% 
  str_replace("::resampled", "")
 

## add in computed purity and other values

mdata_og <- sc_objs$phosphorothioate$meta_dat %>% 
  filter(cell_id %in% rownames(sobj@meta.data))

resampled_mdata <- filter(mdata_og, resampled) %>% 
  select(cell_id, og_purity_reads = purity_reads) %>% 
  mutate(cell_id = str_c(cell_id, "::resampled"))

mdata_og <- mdata_og %>% 
  select(cell_id, og_purity_reads) %>% 
  bind_rows(., resampled_mdata) %>% 
  as.data.frame() %>% 
  column_to_rownames("cell_id")

new_ids <- mutate(new_ids, 
                  resampled = ifelse(cell %in% resampled_cell_ids, 
                                      "original cell",
                                      resampled)) %>% 
  select(cell, new_id, resampled) %>% 
  as.data.frame(.) %>% 
  column_to_rownames("cell")

sobj <- AddMetaData(sobj, new_ids)
sobj <- AddMetaData(sobj, mdata_og)
sobj <- SetAllIdent(sobj, "new_id")
sobj <- NormalizeData(sobj)
sobj <- ScaleData(sobj, vars.to.regress = "nUMI")
sobj <- FindVariableGenes(sobj)

sobj <- RunPCA(sobj, pc.genes = rownames(sobj@data))
sobj <- RunTSNE(sobj, dims.use = 1:5, seed.use = 20180502)

tsne_plt <- TSNEPlot(sobj, colors.use = brewer.pal(7, "Set1")) + 
  labs(title = "Breast Cancer PDX") +
  theme(legend.position = "bottom")

resampled_cells <- TSNEPlot(SetAllIdent(sobj, 
                                         "resampled"), 
                             colors.use = c("lightgrey",
                                            brewer.pal(7, "Set1")[1:2])) + 
  labs(title = "Breast Cancer PDX") +
  theme(legend.position = "bottom")

tmp <- resampled_cells$data 

og_cell_dat <- tmp[cells$phosphoro, ] %>%
  rownames_to_column("cell")

resampled_cell_dat <- tmp[str_c(cells$phosphoro, "::resampled"), ] %>%
  rownames_to_column("cell") %>% 
  mutate(cell = str_replace(cell, "::resampled", "")) %>% 
  select(cell, tSNE_1, tSNE_2)

cell_dat <- left_join(og_cell_dat, 
                      resampled_cell_dat, 
                      by = "cell", 
                      suffix = c("", "_resampled"))


resampled_cells <- TSNEPlot(SetAllIdent(sobj, 
                                         "resampled"), 
                             colors.use = c("lightgrey",
                                            brewer.pal(7, "Set1")[1:2])) + 
                  geom_segment(data = cell_dat, 
                               aes(x = tSNE_1,
                                   y = tSNE_2, 
                                   xend = tSNE_1_resampled,
                                   yend = tSNE_2_resampled),
                               linejoin = "mitre",
                               arrow = arrow(length = unit(0.03, "npc"))) + 
  labs(title = "Breast Cancer PDX") +
  theme(legend.position = "bottom")

resampled_cells_purity <- FeaturePlot(sobj,
                                       features.plot = "og_purity_reads",
                        no.legend = F, 
                        do.return = T)$og_purity_reads +  
                  geom_segment(data = cell_dat, 
                               aes(x = tSNE_1,
                                   y = tSNE_2, 
                                   xend = tSNE_1_resampled,
                                   yend = tSNE_2_resampled),
                               linejoin = "mitre",
                               arrow = arrow(length = unit(0.01, "npc"))) + 
  labs(title = "Breast Cancer PDX") +
  theme(legend.position = "bottom")

plt <- plot_grid(tsne_plt, 
                 resampled_cells,
                 resampled_cells_purity,
                 nrow = 1)

save_plot("resampled_tsne.pdf", plt, 
          nrow = 1, ncol = 3,
          base_height = 5.5, base_width = 4.25)

kNN analysis

Find the k-nearest neighboors in PCA and gene expression space

## use combined data from above
data.use <- GetCellEmbeddings(object = sobj,
                              reduction.type = "pca",
                              dims.use = 1:20)

## find top 10 nearest neighboors using exact search
knn <- RANN::nn2(data.use, k = 5,
                 searchtype = 'standard',
                 eps = 0)

resampled_idxs <- knn$nn.idx[str_detect(rownames(data.use), "::resampled"), ]

nn_ids <- as_data_frame(t(apply(resampled_idxs, 1,
                      function(x)rownames(data.use)[x])))

colnames(nn_ids) <- c("query_cell", paste0("nearest neighbor ", 1:(ncol(nn_ids) - 1)))

nn_ids

Selected barcodes

sc_metadat <- map(sc_objs["original"], ~.x$meta_dat) %>% 
  bind_rows(.id = "library") %>% 
  filter(str_detect(cell_id, "1Tumor|2LukGFP|3Brain|4Bone"),
         !str_detect(cell_id, "2LukGFPDead")) %>% 
  mutate(cell_type = str_split(cell_id, "_", simplify = T)[, 2],
         cell_type =  str_sub(cell_type, 2),
         cell_type = ifelse(cell_type == "LukGFP",
                         "Primary\ncells",
                         cell_type),
         cell_type = ifelse(cell_type == "Tumor",
                         "Primary\ntumor",
                         cell_type)) 

sc_metadat <- arrange(sc_metadat, resampled)
a <- ggplot(sc_metadat, aes(cell_id, total_reads)) +
  geom_point(aes(color = resampled), pt.size = 0.75) +
  labs(x = "Cell",
       y = "Sequencing Depth") + 
  scale_y_log10(labels = scales::comma, breaks = c(1e3, 1e4,1e5,1e6)) +
  scale_color_manual(values = color_palette) +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.x = element_blank(),
        legend.position = "top")


b <- ggplot(sc_metadat, aes(cell_id, total_reads)) +
  geom_tile(aes(cell_id, 
                y = -.1, 
                height = 0.05 , fill = cell_type)) +
  scale_fill_brewer(palette = "Set1", name = "Cell Type") + 
  theme(
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    axis.line = element_blank(),
    legend.position = "bottom",
    plot.margin=unit(c(0,0,0,0), "lines")
  )

g1 <- ggplotGrob(a)
g2 <- ggplotGrob(b)
maxWidth = grid::unit.pmax(g1$widths, g2$widths)
g1$widths <- as.list(maxWidth)
g2$widths <- as.list(maxWidth)
grid::grid.newpage()
plt <- gridExtra::grid.arrange(gridExtra::arrangeGrob(g1, g2, nrow = 2 , heights=c(.9, .175)))

grid::grid.newpage()

save_plot("selected_cells.pdf", plt, base_height = 5)

Species specificity

resampled_metadat <- map(sc_objs, ~.x$meta_dat) %>% 
  bind_rows(.id = "library") %>% 
   mutate(library = factor(library, 
                          levels = unlist(libs))) %>% 
  arrange(resampled)

cell_names_df <- map(cell_names, 
                      ~data_frame(old_id = names(.x), 
                                  new_id = .x)) %>% 
  bind_rows(., .id = "library")

plt_dat <- resampled_metadat %>% 
  filter(resampled, library != reflib) %>% 
  left_join(.,
            cell_names_df, by = c("cell_id" = "old_id",
                                  "library")) %>% 
  select(new_id, library, contains("purity")) %>% 
  gather(type, purity, -new_id, -library) %>% 
  mutate(type = ifelse(str_detect(type, "^og"),
                       str_replace(type, "^og_", "Original_"),
                       str_c("Resampled_", type)),
         library = factor(library, levels = resampled_libs)) %>% 
  separate(type, c("resampled", "not_import", "count_type"), sep = "_") %>% 
  select(-not_import)

plt_dat <- mutate(plt_dat,
                  count_type = ifelse(count_type == "reads",
                                      "Reads",
                                      "UMIs"))
plt <- ggplot(plt_dat, aes(new_id, purity)) +
  geom_hline(aes(yintercept = 0.80), linetype = "dashed", 
             color = "grey", alpha = 0.75) +
  geom_hline(aes(yintercept = 0.20),  linetype = "dashed", 
             color = "grey", alpha = 0.75) +
  geom_point(aes(color = resampled), 
             position = position_dodge(width = 0.5)) +

  facet_wrap(count_type~library, 
             labeller = labeller(library = lib_names),
             scales = "free_x", nrow = 1) +
  scale_color_brewer(palette = "Set1", name = "") +
  guides(color = guide_legend(override.aes = list(size = 5))) + 
  labs(x = "",
       y = expression(paste( "Species Purity ", 
                             frac("Human", "Human + Mouse")))) +
  theme(axis.text.x = element_text(angle = 90,
                                   hjust = 1,
                                   vjust = 0.5),
        legend.pos = "top")


save_plot("species_specificity.pdf", plt,
          base_width = 12, base_height= 6.5)